import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Netflix catalogue from a CSV file in the working directory.
netflix = pd.read_csv('netflix_titles.csv')
# Keep an untouched copy: `netflix` is cleaned further down, while `data`
# is read later by helpers that work on the raw rows.
data = netflix.copy()
# (rows, columns) of the loaded frame.
netflix.shape
(8807, 12)
# Preview the first rows of the frame.
netflix.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Summarise column dtypes and non-null counts.
netflix.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 7976 non-null object 6 date_added 8797 non-null object 7 release_year 8807 non-null int64 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: int64(1), object(11) memory usage: 825.8+ KB
# Number of missing values in each column.
netflix.isna().sum()
show_id 0 type 0 title 0 director 2634 cast 825 country 831 date_added 10 release_year 0 rating 4 duration 3 listed_in 0 description 0 dtype: int64
# Share of missing values per column, in percent.  isna().mean() is the
# missing count divided by the row count; computing it for every column at
# once keeps all figures in one Series instead of only the last column's.
missing_percent = netflix.isna().mean() * 100
missing_percent
# Rows lacking any of these fields are discarded outright.
required_columns = ['cast', 'country', 'date_added', 'rating', 'duration']
netflix = netflix.dropna(subset=required_columns, how='any')
# A missing director is kept, but labelled with a placeholder.
netflix = netflix.assign(director=netflix['director'].fillna('Not Found'))
# Re-check the missing-value counts after cleaning.
netflix.isna().sum()
show_id 0 type 0 title 0 director 0 cast 0 country 0 date_added 0 release_year 0 rating 0 duration 0 listed_in 0 description 0 dtype: int64
# The description text is not used by the analysis below.
netflix = netflix.drop(columns='description')
# Strip surrounding whitespace before parsing so that padded date strings
# do not break pandas' single-format inference.  Bracket assignment is used
# because attribute assignment only works for columns that already exist.
netflix['date_added'] = pd.to_datetime(netflix['date_added'].str.strip())
def split_data(col1, col2, frame=None):
    """Count (col1 item, col2 value) pairs after splitting a comma-separated column.

    Args:
        col1: Name of the column holding comma-separated values; each item
            becomes its own row and is stripped of surrounding whitespace.
        col2: Name of the column paired with every item of ``col1``.
        frame: DataFrame to read from.  Defaults to the module-level ``data``
            frame, which is what existing callers rely on.

    Returns:
        A DataFrame with one row per distinct (col1 item, col2) pair and its
        frequency, as produced by ``DataFrame.value_counts().reset_index()``
        (most frequent first; rows with a missing value are not counted).
    """
    source = data if frame is None else frame
    items = source[col1].str.split(',', expand=False)
    # The paired column is used as-is: squeezing it would collapse a one-row
    # column to a scalar and make the concat below fail.
    paired = pd.concat([items, source[col2]], axis=1)
    paired = paired.explode(col1).reset_index(drop=True)
    paired[col1] = paired[col1].str.strip()
    return paired.value_counts().reset_index()
# The frequency column may carry the label 0; give it a readable name.
count_label = {0: "count"}
# Titles per (country, type) and per (genre, type).
df1 = split_data('country', 'type').rename(columns=count_label)
df2 = split_data('listed_in', 'type').rename(columns=count_label)
df1
| country | type | count | |
|---|---|---|---|
| 0 | United States | Movie | 2752 |
| 1 | India | Movie | 962 |
| 2 | United States | TV Show | 938 |
| 3 | United Kingdom | Movie | 534 |
| 4 | Canada | Movie | 319 |
| ... | ... | ... | ... |
| 179 | Croatia | TV Show | 1 |
| 180 | Paraguay | Movie | 1 |
| 181 | Jamaica | Movie | 1 |
| 182 | Samoa | Movie | 1 |
| 183 | Liechtenstein | Movie | 1 |
184 rows × 3 columns
# Display the per-genre counts built from the `listed_in` column.
df2
| listed_in | type | count | |
|---|---|---|---|
| 0 | International Movies | Movie | 2752 |
| 1 | Dramas | Movie | 2427 |
| 2 | Comedies | Movie | 1674 |
| 3 | International TV Shows | TV Show | 1351 |
| 4 | Documentaries | Movie | 869 |
| 5 | Action & Adventure | Movie | 859 |
| 6 | TV Dramas | TV Show | 763 |
| 7 | Independent Movies | Movie | 756 |
| 8 | Children & Family Movies | Movie | 641 |
| 9 | Romantic Movies | Movie | 616 |
| 10 | TV Comedies | TV Show | 581 |
| 11 | Thrillers | Movie | 577 |
| 12 | Crime TV Shows | TV Show | 470 |
| 13 | Kids' TV | TV Show | 451 |
| 14 | Docuseries | TV Show | 395 |
| 15 | Music & Musicals | Movie | 375 |
| 16 | Romantic TV Shows | TV Show | 370 |
| 17 | Horror Movies | Movie | 357 |
| 18 | Stand-Up Comedy | Movie | 343 |
| 19 | Reality TV | TV Show | 255 |
| 20 | British TV Shows | TV Show | 253 |
| 21 | Sci-Fi & Fantasy | Movie | 243 |
| 22 | Sports Movies | Movie | 219 |
| 23 | Anime Series | TV Show | 176 |
| 24 | Spanish-Language TV Shows | TV Show | 174 |
| 25 | TV Action & Adventure | TV Show | 168 |
| 26 | Korean TV Shows | TV Show | 151 |
| 27 | Classic Movies | Movie | 116 |
| 28 | LGBTQ Movies | Movie | 102 |
| 29 | TV Mysteries | TV Show | 98 |
| 30 | Science & Nature TV | TV Show | 92 |
| 31 | TV Sci-Fi & Fantasy | TV Show | 84 |
| 32 | TV Horror | TV Show | 75 |
| 33 | Anime Features | Movie | 71 |
| 34 | Cult Movies | Movie | 71 |
| 35 | Teen TV Shows | TV Show | 69 |
| 36 | Faith & Spirituality | Movie | 65 |
| 37 | Movies | Movie | 57 |
| 38 | TV Thrillers | TV Show | 57 |
| 39 | Stand-Up Comedy & Talk Shows | TV Show | 56 |
| 40 | Classic & Cult TV | TV Show | 28 |
| 41 | TV Shows | TV Show | 16 |
# Number of titles carrying each content rating.
rating_sizes = netflix.groupby(['rating']).size()
df3 = rating_sizes.reset_index(name='counts')
# One pie slice per rating, sized by its title count.
pieChart = px.pie(
    df3,
    values='counts',
    names='rating',
    title='Distribution of Content Ratings on Netflix',
)
pieChart.show()
df3
| rating | counts | |
|---|---|---|
| 0 | G | 40 |
| 1 | NC-17 | 3 |
| 2 | NR | 62 |
| 3 | PG | 275 |
| 4 | PG-13 | 470 |
| 5 | R | 779 |
| 6 | TV-14 | 1755 |
| 7 | TV-G | 158 |
| 8 | TV-MA | 2657 |
| 9 | TV-PG | 653 |
| 10 | TV-Y | 209 |
| 11 | TV-Y7 | 222 |
| 12 | TV-Y7-FV | 4 |
| 13 | UR | 3 |
# Label titles without a cast so they can be filtered out after counting.
netflix['cast'] = netflix['cast'].fillna('No Cast Specified')
# One row per (title, actor).  Names are stripped after splitting: every
# name following a comma otherwise keeps its leading space, and the same
# actor would be counted under two different keys.
filtered_cast = (
    netflix['cast']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
    .to_frame()
)
filtered_cast.columns = ['Actor']
# Number of titles each actor appears in.
actors = filtered_cast.groupby(['Actor']).size().reset_index(name='Total Content')
actors = actors[actors.Actor != 'No Cast Specified']
actors = actors.sort_values(by=['Total Content'], ascending=False)
# Take the five most frequent actors, then order them ascending for display.
actorsTop5 = actors.head()
actorsTop5 = actorsTop5.sort_values(by=['Total Content'])
actorsTop5
| Actor | Total Content | |
|---|---|---|
| 36137 | Shah Rukh Khan | 25 |
| 22568 | Paresh Rawal | 25 |
| 22277 | Om Puri | 27 |
| 28537 | Takahiro Sakurai | 28 |
| 2461 | Anupam Kher | 39 |
# Work on the raw rows that have no missing value in any column.
clean_data = data.dropna().reset_index()
# One row per (type, director): split the comma-separated director list and
# explode it, preserving row order.  Names are stripped so that a
# co-director listed after a comma is not keyed with a leading space.
director_data = (
    clean_data[['type', 'director']]
    .assign(director=clean_data['director'].astype(str).str.split(','))
    .explode('director')
    .reset_index(drop=True)
)
director_data['director'] = director_data['director'].str.strip()
director_data
| type | director | |
|---|---|---|
| 0 | Movie | Haile Gerima |
| 1 | TV Show | Andy Devonshire |
| 2 | Movie | Theodore Melfi |
| 3 | Movie | Christian Schwochow |
| 4 | Movie | S. Shankar |
| ... | ... | ... |
| 5955 | Movie | Majid Al Ansari |
| 5956 | Movie | David Fincher |
| 5957 | Movie | Ruben Fleischer |
| 5958 | Movie | Peter Hewitt |
| 5959 | Movie | Mozez Singh |
5960 rows × 2 columns
# Frequency of each (type, director) pair, with both index levels moved
# back into ordinary columns.
director_data_count = director_data.value_counts().to_frame().reset_index(level=[0, 1])
# Rename the frequency column from the label 0, when present, to 'count'.
famous_director = director_data_count.rename(columns={0: 'count'})
famous_director
| type | director | count | |
|---|---|---|---|
| 0 | Movie | Jan Suter | 18 |
| 1 | Movie | Raúl Campos | 18 |
| 2 | Movie | Jay Karas | 15 |
| 3 | Movie | Marcus Raboy | 14 |
| 4 | Movie | Cathy Garcia-Molina | 13 |
| ... | ... | ... | ... |
| 4443 | Movie | Tony Bancroft | 1 |
| 4444 | Movie | Lasja Fauzia Susatyo | 1 |
| 4445 | Movie | Tony Datis | 1 |
| 4446 | Movie | Lars Klevberg | 1 |
| 4447 | TV Show | Ziad Doueiri | 1 |
4448 rows × 3 columns
# One bar chart per content type, showing the first five directors listed
# for that type in `famous_director`.
for unique_type in famous_director['type'].unique():
    top_directors = famous_director[famous_director['type'] == unique_type].iloc[:5]
    bar, ax = plt.subplots(figsize=(10, 10))
    sns.barplot(x='director', y='count', data=top_directors, ax=ax)
    ax.set_xlabel('Director in {}'.format(str(unique_type)))
    ax.set_ylabel('Frequency')
    ax.set_title('Famous Director in {}'.format(str(unique_type)), size=20)
# Per-type subsets of the cleaned catalogue.
movie_data = netflix.loc[netflix['type'] == 'Movie']
tv_show_data = netflix.loc[netflix['type'] == 'TV Show']

# Titles per (type, release_year), with the index levels turned back into
# columns and the frequency column named 'count'.
temp = netflix[['type', 'release_year']].value_counts().to_frame()
temp = temp.reset_index(level=[0, 1]).rename(columns={0: 'count'})
# Keep only the first five rows of each type.
temp = pd.concat([temp[temp['type'] == kind].head(5) for kind in ('Movie', 'TV Show')])

sns.catplot(x='release_year', y='count', hue='type', data=temp, kind='point')
plt.xlabel('Release Year')
plt.ylabel('Frequency')
plt.title('Growth of Movie/TV Show over Years', size=14)
Text(0.5, 1.0, 'Growth of Movie/TV Show over Years')
# Titles per release year and type, limited to 2010 onwards.
df4 = netflix[['type', 'release_year']].rename(columns={"release_year": "Release Year"})
yearly_sizes = df4.groupby(['Release Year', 'type']).size()
df5 = yearly_sizes.reset_index(name='Total Content')
df5 = df5.loc[df5['Release Year'] >= 2010]
df5
| Release Year | type | Total Content | |
|---|---|---|---|
| 90 | 2010 | Movie | 140 |
| 91 | 2010 | TV Show | 29 |
| 92 | 2011 | Movie | 137 |
| 93 | 2011 | TV Show | 37 |
| 94 | 2012 | Movie | 162 |
| 95 | 2012 | TV Show | 49 |
| 96 | 2013 | Movie | 196 |
| 97 | 2013 | TV Show | 53 |
| 98 | 2014 | Movie | 237 |
| 99 | 2014 | TV Show | 73 |
| 100 | 2015 | Movie | 344 |
| 101 | 2015 | TV Show | 128 |
| 102 | 2016 | Movie | 574 |
| 103 | 2016 | TV Show | 177 |
| 104 | 2017 | Movie | 649 |
| 105 | 2017 | TV Show | 213 |
| 106 | 2018 | Movie | 653 |
| 107 | 2018 | TV Show | 282 |
| 108 | 2019 | Movie | 513 |
| 109 | 2019 | TV Show | 309 |
| 110 | 2020 | Movie | 417 |
| 111 | 2020 | TV Show | 327 |
| 112 | 2021 | Movie | 152 |
| 113 | 2021 | TV Show | 183 |
# Rebuild the per-year, per-type totals (2010 onwards) so this cell can run
# on its own, then draw one line per content type.
df4 = netflix[['type', 'release_year']].rename(columns={"release_year": "Release Year"})
df5 = df4.groupby(['Release Year', 'type']).size().reset_index(name='Total Content')
df5 = df5.loc[df5['Release Year'] >= 2010]
fig3 = px.line(
    df5,
    x="Release Year",
    y="Total Content",
    color='type',
    title='Trend of content produced over the year',
)
fig3.show()